Group 6 (Script file)¶

In [1]:
# Importing libraries
%matplotlib inline
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import seaborn as sns
from scipy.stats import pearsonr
import plotly.express as px
import plotly.graph_objects as go

# Regression models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Cluster model
from sklearn.cluster import KMeans

# Evaluation tools
from sklearn.metrics import ConfusionMatrixDisplay

# Cross-validation
from sklearn.model_selection import cross_val_score

Preparing the dataset for use; Merging files for the period January - December 2023¶

In [2]:
# Path to directory containing CSV files for each month
path = "./Dataset"

# Getting list of CSV files in the directory
all_files = glob.glob(path + "/dublinbike-historical-data-*.csv")

# List to store DataFrames for each month
dfs = []

# Iterating over each CSV file
for filename in all_files:
    # Reading CSV file into a DataFrame
    df = pd.read_csv(filename)
    
    # Extracting month from the filename
    month = filename.split('-')[-1].split('.')[0]
    
    # Adding a new column for month
    df['Month'] = month
    
    # Appending DataFrame to the list
    dfs.append(df)

# Concatenating all DataFrames into one
merged_df = pd.concat(dfs, ignore_index=True)

# Printing number of rows in merged dataset
print("\nThe total rows in merged dataset is:", len(merged_df))

rows_in_each_dataset = [len(df) for df in dfs]

# Printing number of rows in each dataset
print("\nNumber of rows in each individual dataset:")
count = 0
for i, rows in enumerate(rows_in_each_dataset, start=1):
    print(f"Dataset {i}: {rows} rows")
    count = count + rows
print("\nThe total rows are:", count)

# Printing the head of the merged dataset to get an overview
merged_df.head()

# Checking for missing values in key columns
print("\nMissing values in each column:\n", merged_df.isnull().sum())

# Saving the merged DataFrame to a new CSV file (as backup)
merged_df.to_csv('merged_data.csv', index=False)
The total rows in merged dataset is: 1994400

Number of rows in each individual dataset:
Dataset 1: 168144 rows
Dataset 2: 152903 rows
Dataset 3: 169844 rows
Dataset 4: 164160 rows
Dataset 5: 169367 rows
Dataset 6: 163590 rows
Dataset 7: 169632 rows
Dataset 8: 169404 rows
Dataset 9: 164046 rows
Dataset 10: 169746 rows
Dataset 11: 164046 rows
Dataset 12: 169518 rows

The total rows are: 1994400

Missing values in each column:
 STATION ID               0
TIME                     0
LAST UPDATED             0
NAME                     0
BIKE_STANDS              0
AVAILABLE_BIKE_STANDS    0
AVAILABLE_BIKES          0
STATUS                   0
ADDRESS                  0
LATITUDE                 0
LONGITUDE                0
Month                    0
dtype: int64

Defining the peak hours and subsetting the dataset accordingly:¶

  • Monday - Friday (7am - 10am, 12pm - 13pm, 17pm - 20pm);
  • Saturday - Sunday (10am - 17pm)
In [3]:
# Replacing 'datetime_column' with the name of actual datetime column
merged_df['datetime_column'] = pd.to_datetime(merged_df['TIME'])

# Creating a column for the day of the week (Monday=0, Sunday=6)
merged_df['day_of_week'] = merged_df['datetime_column'].dt.dayofweek

# Creating a column for the hour of the day
merged_df['hour_of_day'] = merged_df['datetime_column'].dt.hour

# Defining a function to check if a given row matches the desired time slots
def check_time_slot(row):
    if row['day_of_week'] <= 4:  # Monday to Friday
        return ((7 <= row['hour_of_day'] < 10) or (12 <= row['hour_of_day'] < 13) or (17 <= row['hour_of_day'] < 20))
    else:  # Saturday and Sunday
        return (10 <= row['hour_of_day'] < 17)

# Applying the function to filter the rows
filtered_df = merged_df[merged_df.apply(check_time_slot, axis=1)]

# Mapping from integers to weekday names
day_of_week_map = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}

# Creating a new column 'weekday_name' by applying the mapping to the 'day_of_week' column
filtered_df['weekday_name'] = filtered_df['day_of_week'].map(day_of_week_map)

# Printing the head of the modified dataset to get an overview
filtered_df.head()

# Printing the number of rows in filtered dataset
print("The total rows in filtered dataset is:",len(filtered_df))

# Printing the names of the columns in the filtered dataset
print("\nColumn names in the filtered dataset:", list(filtered_df.columns))

# Saving the filtered dataset for further use (as backup)
filtered_df.to_csv('filtered_data.csv', index=False)
C:\Users\Dell\AppData\Local\Temp\ipykernel_43472\4262330997.py:32: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['weekday_name'] = filtered_df['day_of_week'].map(day_of_week_map)
The total rows in filtered dataset is: 581693

Column names in the filtered dataset: ['STATION ID', 'TIME', 'LAST UPDATED', 'NAME', 'BIKE_STANDS', 'AVAILABLE_BIKE_STANDS', 'AVAILABLE_BIKES', 'STATUS', 'ADDRESS', 'LATITUDE', 'LONGITUDE', 'Month', 'datetime_column', 'day_of_week', 'hour_of_day', 'weekday_name']

Task 1¶

In [4]:
# Loading the dataset for use
# Note that the data that was prepared for analysis (filtered_data.csv) is used
dataset = pd.read_csv("filtered_data.csv")

# Removing stations that has "Closed" status and also removing STATION ID = 507 (test station)
unique_statuses = dataset['STATUS'].unique()
print("The unique status of the stations are:", unique_statuses) # Print the unique values
closed_count = (dataset['STATUS'] == 'CLOSED').sum() # Count occurrences of "CLOSED" in the STATUS column
print("\nNumber of occurrences of 'CLOSED' in STATUS column:", closed_count) # Print the count
closed_stations = dataset[dataset['STATUS'] == 'CLOSED'] # Filter the dataset for rows where STATUS is "CLOSED"
dataset_unique = dataset[dataset['STATUS'] != 'CLOSED'] # Drop rows where STATUS is "CLOSED"

# Filtering the dataset for rows where STATION ID is 507
dataset_unique = dataset_unique[dataset_unique['STATION ID'] != 507]

print("\n",dataset_unique.head()) # Print the head of the updated dataset
The unique status of the stations are: ['OPEN' 'CLOSED']

Number of occurrences of 'CLOSED' in STATUS column: 70

    STATION ID                 TIME         LAST UPDATED                NAME  \
0           1  2023-01-01 10:00:03  2023-01-01 09:57:04       CLARENDON ROW   
1           2  2023-01-01 10:00:03  2023-01-01 09:55:57  BLESSINGTON STREET   
2           3  2023-01-01 10:00:03  2023-01-01 09:51:11       BOLTON STREET   
3           4  2023-01-01 10:00:03  2023-01-01 09:51:33        GREEK STREET   
4           5  2023-01-01 10:00:03  2023-01-01 09:58:41    CHARLEMONT PLACE   

   BIKE_STANDS  AVAILABLE_BIKE_STANDS  AVAILABLE_BIKES STATUS  \
0           31                     29                2   OPEN   
1           20                     18                2   OPEN   
2           20                      9               11   OPEN   
3           20                      6               14   OPEN   
4           40                     16               24   OPEN   

              ADDRESS  LATITUDE  LONGITUDE  Month      datetime_column  \
0       Clarendon Row   53.3409   -6.26250      1  2023-01-01 10:00:03   
1  Blessington Street   53.3568   -6.26814      1  2023-01-01 10:00:03   
2       Bolton Street   53.3512   -6.26986      1  2023-01-01 10:00:03   
3        Greek Street   53.3469   -6.27298      1  2023-01-01 10:00:03   
4   Charlemont Street   53.3307   -6.26018      1  2023-01-01 10:00:03   

   day_of_week  hour_of_day weekday_name  
0            6           10       Sunday  
1            6           10       Sunday  
2            6           10       Sunday  
3            6           10       Sunday  
4            6           10       Sunday  
In [5]:
# Convert 'datetime_column' to datetime
dataset_unique['datetime_column'] = pd.to_datetime(dataset_unique['datetime_column'])

# Extracting year, month, and day
dataset_unique['Year'] = dataset_unique['datetime_column'].dt.year
dataset_unique['Month'] = dataset_unique['datetime_column'].dt.month
dataset_unique['Day'] = dataset_unique['datetime_column'].dt.day

# Grouping by 'STATION ID', year, month, and day, and calculating mean of 'AVAILABLE_BIKE_STANDS' and 'BIKE_STANDS'
daily_means = dataset_unique.groupby(['STATION ID', 'Year', 'Month', 'Day']).agg({'AVAILABLE_BIKE_STANDS': 'mean', 'BIKE_STANDS': 'mean'}).reset_index()

# Calculating usage percentage for each station on a daily basis
daily_means['Usage Percentage'] = (daily_means['BIKE_STANDS'] - daily_means['AVAILABLE_BIKE_STANDS']) / daily_means['BIKE_STANDS'] * 100

# Merging the 'Usage Percentage' column from daily_means to the original dataset
dataset_unique = pd.merge(dataset_unique, daily_means[['STATION ID', 'Year', 'Month', 'Day', 'Usage Percentage']], on=['STATION ID', 'Year', 'Month', 'Day'], how='left')

# Removing duplicate cases based on 'Month', 'Day', and 'STATION ID'
dataset_unique = dataset_unique.drop_duplicates(subset=['Month', 'Day', 'STATION ID'])

# Displaying the head of the updated dataset
print(dataset_unique.head())
   STATION ID                 TIME         LAST UPDATED                NAME  \
0           1  2023-01-01 10:00:03  2023-01-01 09:57:04       CLARENDON ROW   
1           2  2023-01-01 10:00:03  2023-01-01 09:55:57  BLESSINGTON STREET   
2           3  2023-01-01 10:00:03  2023-01-01 09:51:11       BOLTON STREET   
3           4  2023-01-01 10:00:03  2023-01-01 09:51:33        GREEK STREET   
4           5  2023-01-01 10:00:03  2023-01-01 09:58:41    CHARLEMONT PLACE   

   BIKE_STANDS  AVAILABLE_BIKE_STANDS  AVAILABLE_BIKES STATUS  \
0           31                     29                2   OPEN   
1           20                     18                2   OPEN   
2           20                      9               11   OPEN   
3           20                      6               14   OPEN   
4           40                     16               24   OPEN   

              ADDRESS  LATITUDE  LONGITUDE  Month     datetime_column  \
0       Clarendon Row   53.3409   -6.26250      1 2023-01-01 10:00:03   
1  Blessington Street   53.3568   -6.26814      1 2023-01-01 10:00:03   
2       Bolton Street   53.3512   -6.26986      1 2023-01-01 10:00:03   
3        Greek Street   53.3469   -6.27298      1 2023-01-01 10:00:03   
4   Charlemont Street   53.3307   -6.26018      1 2023-01-01 10:00:03   

   day_of_week  hour_of_day weekday_name  Year  Day  Usage Percentage  
0            6           10       Sunday  2023    1          7.603687  
1            6           10       Sunday  2023    1         27.142857  
2            6           10       Sunday  2023    1         46.428571  
3            6           10       Sunday  2023    1         63.214286  
4            6           10       Sunday  2023    1         58.214286  

Note usage percentage refers to percentage of available bikes. Thus, lower usage percentage value refers to more availability of the stands and higher utilization of bikes.¶

We now run various machine learning models to understand which model makes better predictions of usage level given the set of features.¶

In [6]:
# Setting the target variable
y = dataset_unique['Usage Percentage']

# Setting the feature variable
X_new = dataset_unique[['STATION ID', 'AVAILABLE_BIKES', 'day_of_week']]

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

models = {
    "Ridge Base": Ridge(),
    "Lasso Base": Lasso(),
    "ElasticNet Base": ElasticNet(),
    'Linear Regression': LinearRegression(), # Linear Regression
    'Random Forest Regressor': RandomForestRegressor(random_state=42) # Random Forest Regressor
}

# Calculating the MSE value for all the models used
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    mse_train = mean_squared_error(y_train, y_pred_train)
    print(f"{name} (MSE_Test) = {mse_test}")
    print(f"{name} (MSE_Train) = {mse_train}")
Ridge Base (MSE_Test) = 296.7166429908603
Ridge Base (MSE_Train) = 285.8666307351733
Lasso Base (MSE_Test) = 297.12323957199254
Lasso Base (MSE_Train) = 286.12717398594134
ElasticNet Base (MSE_Test) = 296.96748896892075
ElasticNet Base (MSE_Train) = 285.9860569449409
Linear Regression (MSE_Test) = 296.7166390299895
Linear Regression (MSE_Train) = 285.8666307351055
Random Forest Regressor (MSE_Test) = 174.63904270377736
Random Forest Regressor (MSE_Train) = 77.43153515587545

On running various regression models which makes predictions for the usage percentage of the stations, we see that the Random Forest Regressor Model gives the best result. We therefore, use Random Forest Regressor Model for our case.¶

We next run a plot between predicted value and actual value to compare and check if our model is overfitting or not.¶

In [7]:
# Making predictions on the test set using the Random Forest Regressor model
y_pred_rf = models['Random Forest Regressor'].predict(X_test)

# Plotting graph to check the spread of fitted and actual values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_rf, color='blue', label='Predicted usage')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', label='Actual usage')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Random Forest Regressor Predictions vs. Actual Values of Usage')
plt.legend()
plt.grid(True)
plt.show()

We see that the Random Forest Regressor model is not overfitting and has multiple points that are far away from the predicted model and so is a good enough model to be used.¶

So, we then use this model to make predictions for all the stations. We also add the predicted values to our dataset for use.¶

In [8]:
# Making predictions for all stations
dataset_unique['Usage_predictions'] = models['Random Forest Regressor'].predict(X_new)

# Removing additional columns and duplicate cases 
# Note that each row in the dataset corresponds to daily average usage level of bikes for a station
data_pred = dataset_unique.groupby('STATION ID').agg({
    'NAME': 'first',
    'BIKE_STANDS': 'mean',
    'AVAILABLE_BIKE_STANDS': 'mean',
    'AVAILABLE_BIKES': 'mean',
    'LATITUDE': 'mean',
    'LONGITUDE': 'mean',
    'Usage_predictions': 'mean'}).reset_index()

print('The dataset with predicted values for daily usage level for each station is:\n', data_pred)

# Selecting top 5 and bottom 5 stations in terms of usage
top_5_predicted = data_pred.nlargest(5, 'Usage_predictions')
bottom_5_predicted = data_pred.nsmallest(5, 'Usage_predictions')

print('\nPREDICTED: TOP 5 STATIONS\n', top_5_predicted[['STATION ID', 'NAME', 'Usage_predictions']])
print('\nPREDICTED: BOTTOM 5 STATIONS\n', bottom_5_predicted[['STATION ID', 'NAME', 'Usage_predictions']])

# Saving the dataset for further use (Task 2)
data_pred.to_csv('dataset_with_predictions.csv', index=False)
The dataset with predicted values for daily usage level for each station is:
      STATION ID                   NAME  BIKE_STANDS  AVAILABLE_BIKE_STANDS  \
0             1          CLARENDON ROW         31.0              16.961644   
1             2     BLESSINGTON STREET         20.0              14.687671   
2             3          BOLTON STREET         20.0              12.476712   
3             4           GREEK STREET         20.0              11.194521   
4             5       CHARLEMONT PLACE         40.0              12.410959   
..          ...                    ...          ...                    ...   
109         113   MERRION SQUARE SOUTH         40.0              30.624658   
110         114  WILTON TERRACE (PARK)         40.0              27.139726   
111         115       KILLARNEY STREET         30.0               8.564384   
112         116             BROADSTONE         30.0              27.895890   
113         117      HANOVER QUAY EAST         40.0              31.742466   

     AVAILABLE_BIKES  LATITUDE  LONGITUDE  Usage_predictions  
0          14.024658   53.3409   -6.26250          50.898496  
1           5.304110   53.3568   -6.26814          18.881614  
2           7.506849   53.3512   -6.26986          37.645840  
3           8.775342   53.3469   -6.27298          47.050910  
4          27.553425   53.3307   -6.26018          42.660139  
..               ...       ...        ...                ...  
109         7.463014   53.3386   -6.24861          27.959888  
110        12.490411   53.3337   -6.24834          40.782061  
111        21.402740   53.3548   -6.24758          46.850345  
112         2.095890   53.3547   -6.27231           5.300500  
113         8.241096   53.3437   -6.23175          30.305323  

[114 rows x 8 columns]

PREDICTED: TOP 5 STATIONS
     STATION ID                               NAME  Usage_predictions
88          92             HEUSTON BRIDGE (NORTH)          68.527906
9           10                        DAME STREET          66.885357
32          33  PRINCES STREET / O'CONNELL STREET          66.261155
22          23                       CUSTOM HOUSE          62.728942
21          22                    TOWNSEND STREET          61.522133

PREDICTED: BOTTOM 5 STATIONS
      STATION ID                          NAME  Usage_predictions
112         116                    BROADSTONE           5.300500
107         111          MOUNTJOY SQUARE EAST           6.458657
85           89       FITZWILLIAM SQUARE EAST          13.520390
100         104  GRANGEGORMAN LOWER (CENTRAL)          15.565386
59           61               HARDWICKE PLACE          15.660697
In [9]:
# Plotting heatmap corresponding to daily average usage level of bikes for each station
fig = px.scatter_mapbox(data_pred, 
                        lat='LATITUDE', 
                        lon='LONGITUDE',
                        hover_name='NAME',
                        hover_data=['Usage_predictions', 'STATION ID'],
                        color='Usage_predictions',
                        size='Usage_predictions',
                        size_max=20,
                        color_continuous_scale='Jet',
                        mapbox_style="carto-positron",
                        zoom=12)

# Updating layout
fig.update_layout(title='Daily Average Usage Percentage Heatmap for Dublin Bikes')

# Showing the figure
fig.show()

Task 2¶

In [10]:
# Loading the dataset for use
data_pred = pd.read_csv("dataset_with_predictions.csv")
print(data_pred) #Printing the dataset to visualize
     STATION ID                   NAME  BIKE_STANDS  AVAILABLE_BIKE_STANDS  \
0             1          CLARENDON ROW         31.0              16.961644   
1             2     BLESSINGTON STREET         20.0              14.687671   
2             3          BOLTON STREET         20.0              12.476712   
3             4           GREEK STREET         20.0              11.194521   
4             5       CHARLEMONT PLACE         40.0              12.410959   
..          ...                    ...          ...                    ...   
109         113   MERRION SQUARE SOUTH         40.0              30.624658   
110         114  WILTON TERRACE (PARK)         40.0              27.139726   
111         115       KILLARNEY STREET         30.0               8.564384   
112         116             BROADSTONE         30.0              27.895890   
113         117      HANOVER QUAY EAST         40.0              31.742466   

     AVAILABLE_BIKES  LATITUDE  LONGITUDE  Usage_predictions  
0          14.024658   53.3409   -6.26250          50.898496  
1           5.304110   53.3568   -6.26814          18.881614  
2           7.506849   53.3512   -6.26986          37.645840  
3           8.775342   53.3469   -6.27298          47.050910  
4          27.553425   53.3307   -6.26018          42.660139  
..               ...       ...        ...                ...  
109         7.463014   53.3386   -6.24861          27.959888  
110        12.490411   53.3337   -6.24834          40.782061  
111        21.402740   53.3548   -6.24758          46.850345  
112         2.095890   53.3547   -6.27231           5.300500  
113         8.241096   53.3437   -6.23175          30.305323  

[114 rows x 8 columns]
In [11]:
# Feature set for clustering
X = data_pred[['Usage_predictions', 'LATITUDE', 'LONGITUDE']]

# Specifying the number of clusters
n_clusters = 6

# Performing K-means clustering
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)

# Getting the cluster labels
labels = kmeans.labels_

# Adding cluster labels to the DataFrame
data_pred['cluster_kmeans'] = labels
data_pred['cluster_kmeans'] = data_pred['cluster_kmeans'].astype(str)

# Setting the colour scheme for the clusters
cluster_colors1 = {
    '0': 'green',#green
    '1': 'red', #red
    '2': 'orange', #orange
    '3': 'blue', #blue
    '4': 'brown', #brown
    '5': 'black'#black
}

# Plotting with Plotly Express
fig = px.scatter_mapbox(data_pred,
                        lat="LATITUDE",
                        lon="LONGITUDE",
                        color="cluster_kmeans", 
                        color_discrete_map=cluster_colors1,
                        size="Usage_predictions",
                        size_max=15,
                        zoom=12,
                        mapbox_style="carto-positron",
                        title="KMeans Dublin Clusters (6)"
                        )

fig.show()
C:\Users\Dell\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1446: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

We see that the k-means cluster model isn't the best when it comes to clustering stations based on proximity and usage-level. The clusters prepared by the model has a wide spread across the city. We therefore use a manual clustering approach to help make optimal business decision.¶

We use a cluster-weighted model to understand the load of each station and accordingly make usage predictions.¶

Steps followed are as follows:¶

  • Identify 3 clusters of stations which have high daily usage percentages.
  • Calculate the total demand for bikes in the cluster (average demand in a day) and then calculate weights for each station in the cluster. The weights will help us understand how much of the total demand of the cluster is met by a particular station.
  • We then add a new station somewhere in the cluster having a total capacity of bikes = average capacity of all stations in the cluster.
  • We then calculate the revised weights for the stations (including the new station) keeping the total demand for the bikes in the cluster constant. We hope to see reduction in initial weights because of the addition of the new station.
  • We follow all the above steps for 3 low usage clusters after dropping five stations.
  • Lastly, we use the new weights and station ID as our input feature to predict the new usage percentages.
In [12]:
# Creating clusters for high and low usage areas

# Defining the conditions and corresponding cluster values
conditions = [
    data_pred['STATION ID'].isin([93, 100, 92, 86, 94]),
    data_pred['STATION ID'].isin([9, 14, 10, 1, 27, 29, 6, 17]),
    data_pred['STATION ID'].isin([16, 23, 22, 33, 32, 21, 8, 38, 64, 99]),
    data_pred['STATION ID'].isin([103, 104, 105, 102, 116, 110, 2, 30, 12]),
    data_pred['STATION ID'].isin([111, 112, 28, 44, 60, 59, 61, 79, 78, 15]),
    data_pred['STATION ID'].isin([89, 13, 20, 39, 114, 19, 47])
]

cluster_values = [1, 2, 3, 4, 5, 6]

# Setting the colour scheme for the clusters
cluster_colors2 = {
    1: 'red',
    2: 'orange',
    3: 'brown',
    4: 'blue',
    5: 'green',
    6: 'black'
}

# Creating the 'Cluster' variable based on the conditions
data_pred['Cluster'] = np.select(conditions, cluster_values, default=None)

print(data_pred.head()) #Printing the head of the modified dataset

# Plotting heatmap corresponding to daily average usage level of bikes for each station
fig = px.scatter_mapbox(data_pred, 
                        lat='LATITUDE', 
                        lon='LONGITUDE',
                        hover_name='NAME',
                        hover_data=['Usage_predictions', 'STATION ID', 'Cluster'],
                        color='Cluster',  
                        color_discrete_map=cluster_colors2,  
                        size='Usage_predictions',
                        size_max=20,
                        mapbox_style="carto-positron",
                        zoom=12)

# Updating layout
fig.update_layout(
    title='Clusters for weight',
    legend_title='Cluster',  # Set legend title
    legend=dict(
        title='Cluster',
        traceorder='normal',  # Ensure numerical order
        ),
    mapbox=dict(
        center=dict(lat=data_pred['LATITUDE'].mean(), lon=data_pred['LONGITUDE'].mean()),  # Set map center
        zoom=12  # Set initial zoom level
    )
)

# Showing the figure
fig.show()
   STATION ID                NAME  BIKE_STANDS  AVAILABLE_BIKE_STANDS  \
0           1       CLARENDON ROW         31.0              16.961644   
1           2  BLESSINGTON STREET         20.0              14.687671   
2           3       BOLTON STREET         20.0              12.476712   
3           4        GREEK STREET         20.0              11.194521   
4           5    CHARLEMONT PLACE         40.0              12.410959   

   AVAILABLE_BIKES  LATITUDE  LONGITUDE  Usage_predictions cluster_kmeans  \
0        14.024658   53.3409   -6.26250          50.898496              1   
1         5.304110   53.3568   -6.26814          18.881614              4   
2         7.506849   53.3512   -6.26986          37.645840              0   
3         8.775342   53.3469   -6.27298          47.050910              3   
4        27.553425   53.3307   -6.26018          42.660139              3   

  Cluster  
0       2  
1       4  
2    None  
3    None  
4    None  

Note that clusters 1 (red), 2 (orange), and 3 (brown) are high usage clusters i.e., these clusters mostly contain stations with relatively high average daily usage level of bikes (ones with big circles) alongwith some other stations that are nearby. These are the clusters where at least one new station will be added in each.¶

Clusters 4 (blue), 5 (green), and 6 (black) are low usage clusters i.e., these clusters contain the stations with relatively low average daily usage level of bikes (ones with small circles) and will be later dropped/removed from the clusters.¶

In [13]:
# Removing rows with 'None' values in the 'Cluster' column
data = data_pred.dropna(subset=['Cluster'])

# Calculating total bike stands in each cluster
total_bike_stands_cluster = data.groupby('Cluster')['BIKE_STANDS'].sum()
print('The total bike stands in each cluster is:\n', total_bike_stands_cluster)

# Calculating total available bikes in each cluster using 'sum()' instead of 'mean()'
total_available_bikes_cluster = data.groupby('Cluster')['AVAILABLE_BIKES'].sum()
print('\nThe total available bikes in each cluster is:\n', total_available_bikes_cluster)

# Iterating over unique cluster IDs
for cluster_id in data['Cluster'].unique():
    # Filtering data for stations belonging to the current cluster
    cluster_data = data[data['Cluster'] == cluster_id]
    
    # Calculating 'weight_initial' for stations in the current cluster
    data.loc[data['Cluster'] == cluster_id, 'Weight_initial'] = \
        cluster_data['AVAILABLE_BIKES'] * total_bike_stands_cluster.loc[cluster_id] / total_available_bikes_cluster.loc[cluster_id]

# Printing the head of the DataFrame with 'Weight_initial' column
print('\nThe dataset with initial weight values is:\n', data.head())
The total bike stands in each cluster is:
 Cluster
1    183.0
2    190.0
3    293.0
4    286.0
5    288.0
6    230.0
Name: BIKE_STANDS, dtype: float64

The total available bikes in each cluster is:
 Cluster
1    129.810959
2     77.578082
3    127.309589
4     56.827397
5     79.800000
6     47.367123
Name: AVAILABLE_BIKES, dtype: float64

The dataset with initial weight values is:
    STATION ID                NAME  BIKE_STANDS  AVAILABLE_BIKE_STANDS  \
0           1       CLARENDON ROW         31.0              16.961644   
1           2  BLESSINGTON STREET         20.0              14.687671   
5           6  CHRISTCHURCH PLACE         20.0              13.504110   
7           8   CUSTOM HOUSE QUAY         30.0              15.736986   
8           9    EXCHEQUER STREET         24.0              13.287671   

   AVAILABLE_BIKES  LATITUDE  LONGITUDE  Usage_predictions cluster_kmeans  \
0        14.024658   53.3409   -6.26250          50.898496              1   
1         5.304110   53.3568   -6.26814          18.881614              4   
5         6.432877   53.3434   -6.27012          29.910910              2   
7         9.128767   53.3479   -6.24805          50.995859              1   
8        10.652055   53.3430   -6.26358          58.496224              1   

  Cluster  Weight_initial  
0       2       34.348425  
1       4       26.694436  
5       2       15.755050  
7       3       21.009641  
8       2       26.088431  
C:\Users\Dell\AppData\Local\Temp\ipykernel_43472\1645359863.py:18: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [14]:
# Calculating average bike stands and available bikes for each cluster
average_bike_stands_cluster = data.groupby('Cluster')['BIKE_STANDS'].mean()

# Creating a new DataFrame to store the new rows
new_rows = pd.DataFrame(columns=data.columns)

# List of cluster IDs to iterate over
clusters_to_iterate = [1, 2, 3]

# Latitude and longitude values for the new rows
# Note that Google Streetview was used to identify exact location points on the map that optimise user experience 
new_row_coords = [
    (53.346459599999996, -6.292984499999996),
    (53.34420817918841, -6.260144986133722),
    (53.34742644494483, -6.255404095167318)
]

# Iterating over unique cluster IDs
for idx, cluster_id in enumerate(clusters_to_iterate):
    # Creating a new row with average values for bike stands and available bikes
    new_row = {
        'STATION ID': f'new_{idx+1}',  
        'NAME': f'new_{idx+1}',  
        'BIKE_STANDS': average_bike_stands_cluster[cluster_id],
        'AVAILABLE_BIKE_STANDS': 0,
        'AVAILABLE_BIKES': average_bike_stands_cluster[cluster_id],  
        'LATITUDE': new_row_coords[idx][0],  
        'LONGITUDE': new_row_coords[idx][1],  
        'Usage_predictions': 0,  
        'Cluster': cluster_id,  
        'Weight_initial': 0  
    }
    
    # Appending the new row to the new_rows DataFrame
    new_rows = pd.concat([new_rows, pd.DataFrame([new_row])], ignore_index=True)

# Appending the new rows to the original DataFrame
data = pd.concat([data, new_rows], ignore_index=True)

# List of station IDs to remove
stations_to_remove = [116, 111, 89, 61, 104]

# Filtering out rows with the specified station IDs
data = data[~data['STATION ID'].isin(stations_to_remove)]

# Printing the updated DataFrame with new stations added and low usage stations removed
print('The updated dataset with new stations added and low usage stations removed is:\n', data)
The updated dataset with new stations added and low usage stations removed is:
    STATION ID                               NAME  BIKE_STANDS  \
0           1                      CLARENDON ROW        31.00   
1           2                 BLESSINGTON STREET        20.00   
2           6                 CHRISTCHURCH PLACE        20.00   
3           8                  CUSTOM HOUSE QUAY        30.00   
4           9                   EXCHEQUER STREET        24.00   
5          10                        DAME STREET        16.00   
6          12                      ECCLES STREET        20.00   
7          13            FITZWILLIAM SQUARE WEST        30.00   
8          14                FOWNES STREET UPPER        30.00   
9          15                   HARDWICKE STREET        16.00   
10         16                       GEORGES QUAY        20.00   
11         17                        GOLDEN LANE        20.00   
12         19                      HERBERT PLACE        30.00   
13         20                  JAMES STREET EAST        30.00   
14         21              LEINSTER STREET SOUTH        30.00   
15         22                    TOWNSEND STREET        20.00   
16         23                       CUSTOM HOUSE        30.00   
17         27                  MOLESWORTH STREET        20.00   
18         28               MOUNTJOY SQUARE WEST        30.00   
19         29                  ORMOND QUAY UPPER        29.00   
20         30               PARNELL SQUARE NORTH        20.00   
21         32                      PEARSE STREET        30.00   
22         33  PRINCES STREET / O'CONNELL STREET        23.00   
23         38                      TALBOT STREET        40.00   
24         39                     WILTON TERRACE        20.00   
25         44              UPPER SHERRARD STREET        30.00   
26         47                     HERBERT STREET        40.00   
27         59               DENMARK STREET GREAT        20.00   
28         60                NORTH CIRCULAR ROAD        30.00   
30         64                    SANDWITH STREET        40.00   
31         78                     MATER HOSPITAL        40.00   
32         79                 ECCLES STREET EAST        27.00   
33         86                    PARKGATE STREET        38.00   
35         92             HEUSTON BRIDGE (NORTH)        40.00   
36         93          HEUSTON STATION (CENTRAL)        40.00   
37         94         HEUSTON STATION (CAR PARK)        40.00   
38         99                          CITY QUAY        30.00   
39        100             HEUSTON BRIDGE (SOUTH)        25.00   
40        102                        WESTERN WAY        40.00   
41        103         GRANGEGORMAN LOWER (SOUTH)        40.00   
43        105         GRANGEGORMAN LOWER (NORTH)        36.00   
44        110                  PHIBSBOROUGH ROAD        40.00   
46        112  NORTH CIRCULAR ROAD (O'CONNELL'S)        30.00   
47        114              WILTON TERRACE (PARK)        40.00   
49      new_1                              new_1        36.60   
50      new_2                              new_2        23.75   
51      new_3                              new_3        29.30   

   AVAILABLE_BIKE_STANDS  AVAILABLE_BIKES   LATITUDE  LONGITUDE  \
0              16.961644        14.024658  53.340900  -6.262500   
1              14.687671         5.304110  53.356800  -6.268140   
2               13.50411         6.432877  53.343400  -6.270120   
3              15.736986         9.128767  53.347900  -6.248050   
4              13.287671        10.652055  53.343000  -6.263580   
5               6.967123         9.005479  53.344000  -6.266800   
6              14.517808         5.482192  53.359200  -6.269780   
7               24.90137         4.830137  53.336100  -6.252820   
8              19.487671        10.476712  53.344600  -6.263370   
9              13.054795         2.161644  53.355500  -6.264420   
10             10.342466         9.646575  53.347500  -6.252190   
11             14.454795         5.528767  53.340800  -6.267730   
12             21.334247         8.383562  53.334400  -6.245570   
13             24.931507         5.063014  53.336600  -6.248110   
14             17.424658        12.539726  53.342200  -6.254490   
15              8.967123        10.701370  53.345900  -6.254610   
16             13.920548        15.695890  53.348300  -6.254660   
17             12.531507         7.449315  53.341300  -6.258120   
18             17.087671        12.865753  53.356300  -6.258590   
19             14.936986        14.008219  53.346100  -6.268000   
20             16.131507         3.789041  53.353500  -6.265310   
21             17.252055        12.726027  53.344300  -6.250430   
22              10.29589        12.657534  53.349000  -6.260310   
23             15.484932        24.400000  53.351000  -6.252940   
24             11.156164         8.643836  53.332400  -6.252720   
25             20.673973         9.315068  53.358400  -6.260640   
26             34.216438         5.389041  53.335700  -6.245510   
27              15.30137         4.690411  53.355600  -6.261400   
28             16.087671        13.849315  53.359600  -6.260350   
30             24.682192        15.284932  53.345200  -6.247160   
31             26.926027        13.035616  53.360000  -6.264830   
32             22.813699         4.164384  53.358100  -6.265600   
33             10.345205        27.630137  53.348000  -6.291800   
35              5.506849        34.487671  53.347800  -6.292430   
36              9.876712        30.106849  53.346600  -6.296920   
37             20.030137        19.969863  53.347000  -6.297800   
38             25.449315         4.528767  53.346600  -6.246150   
39              7.375342        17.616438  53.347100  -6.292040   
40             30.019178         9.641096  53.354900  -6.269420   
41             34.964384         5.024658  53.354700  -6.278680   
43             28.194521         7.802740  53.356000  -6.278380   
44             28.452055        11.526027  53.356300  -6.273720   
46             19.668493        10.317808  53.357800  -6.251560   
47             27.139726        12.490411  53.333700  -6.248340   
49                     0        36.600000  53.346460  -6.292984   
50                     0        23.750000  53.344208  -6.260145   
51                     0        29.300000  53.347426  -6.255404   

   Usage_predictions cluster_kmeans Cluster Weight_initial  
0          50.898496              1       2      34.348425  
1          18.881614              4       4      26.694436  
2           29.91091              2       2       15.75505  
3          50.995859              1       3      21.009641  
4          58.496224              1       2      26.088431  
5          66.885357              5       2      22.055728  
6          20.505096              4       4      27.590686  
7          23.652421              2       6      23.453641  
8          53.449347              1       2      25.658991  
9          18.046534              4       5       7.801421  
10         55.528331              1       3      22.201364  
11         27.798568              2       2      13.540754  
12          44.19326              3       6      40.707965  
13         32.093773              0       6      24.584418  
14         57.525088              1       3      28.859882  
15         61.522133              5       3      24.628949  
16         62.728942              5       3       36.12372  
17           58.0668              1       2      18.244455  
18         31.692939              0       5      46.432794  
19         54.602995              1       2      34.308165  
20         18.577317              4       4      19.069424  
21         53.227569              1       3       29.28865  
22         66.261155              5       3      29.131015  
23         57.779409              1       3      56.156021  
24          29.47649              2       6      41.971774  
25         19.381672              4       5      33.618292  
26         26.421215              2       6      26.167505  
27         20.258112              4       5      16.927799  
28         31.614857              0       5       49.98249  
30         42.025642              3       3      35.177907  
31         26.857023              2       5      47.045834  
32          17.44585              4       5      15.029354  
33         53.276721              1       1      38.951373  
35         68.527906              5       1      48.618729  
36         58.993286              1       1      42.442899  
37         40.448883              3       1      28.152361  
38         25.388022              2       3       10.42285  
39         61.496249              5       1      24.834638  
40         23.102577              2       4       48.52155  
41         15.908157              4       4      25.288015  
43         19.402861              4       4      39.269501  
44         27.726214              2       4      58.008003  
46         28.730846              2       5      37.237203  
47         40.782061              3       6      60.649546  
49                 0            NaN       1              0  
50                 0            NaN       2              0  
51                 0            NaN       3              0  
In [15]:
## Calculating the new weights of the stations in each cluster

# Calculating new total bike stands in each cluster
total_bike_stands_cluster_new = data.groupby('Cluster')['BIKE_STANDS'].sum()
print('The new total bike stands in each cluster is:\n', total_bike_stands_cluster_new)

# Calculating new total available bikes in each cluster using 'sum()'
total_available_bikes_cluster_new = data.groupby('Cluster')['AVAILABLE_BIKES'].sum()
print('\nThe new total available bikes in each cluster is:\n', total_available_bikes_cluster_new)

# Iterating over unique cluster IDs
for cluster_id in data['Cluster'].unique():
    # Filtering data for stations belonging to the current cluster
    cluster_data = data[data['Cluster'] == cluster_id]
    
    # Calculating new weights ('Weight_new') for stations in the current cluster
    data.loc[data['Cluster'] == cluster_id, 'Weight_new'] = \
        cluster_data['AVAILABLE_BIKES'] * total_bike_stands_cluster_new.loc[cluster_id] / total_available_bikes_cluster_new.loc[cluster_id]

# Printing the head of the DataFrame with 'Weight_new' column
print('\nThe dataset with new weight values is:\n', data.head())
The new total bike stands in each cluster is:
 Cluster
1    219.60
2    213.75
3    322.30
4    216.00
5    223.00
6    190.00
Name: BIKE_STANDS, dtype: float64

The new total available bikes in each cluster is:
 Cluster
1    166.410959
2    101.328082
3    156.609589
4     48.569863
5     70.400000
6     44.800000
Name: AVAILABLE_BIKES, dtype: float64

The dataset with new weight values is:
   STATION ID                NAME  BIKE_STANDS AVAILABLE_BIKE_STANDS  \
0          1       CLARENDON ROW         31.0             16.961644   
1          2  BLESSINGTON STREET         20.0             14.687671   
2          6  CHRISTCHURCH PLACE         20.0              13.50411   
3          8   CUSTOM HOUSE QUAY         30.0             15.736986   
4          9    EXCHEQUER STREET         24.0             13.287671   

   AVAILABLE_BIKES  LATITUDE  LONGITUDE Usage_predictions cluster_kmeans  \
0        14.024658   53.3409   -6.26250         50.898496              1   
1         5.304110   53.3568   -6.26814         18.881614              4   
2         6.432877   53.3434   -6.27012          29.91091              2   
3         9.128767   53.3479   -6.24805         50.995859              1   
4        10.652055   53.3430   -6.26358         58.496224              1   

  Cluster Weight_initial  Weight_new  
0       2      34.348425   29.584795  
1       4      26.694436   23.588448  
2       2       15.75505   13.570053  
3       3      21.009641   18.786855  
4       2      26.088431   22.470343  

We now run various machine learning models to understand which model makes better predictions of usage level given the set of features.¶

In [16]:
# Setting the target variable
y = data['Usage_predictions'][:-3]

# Setting the feature variable
X_new = data[['Cluster', 'Weight_initial', 'AVAILABLE_BIKES']][:-3]

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

models = {
    "Ridge Base": Ridge(),
    "Lasso Base": Lasso(),
    "ElasticNet Base": ElasticNet(),
    'Linear Regression': LinearRegression(), # Linear Regression
    'Random Forest Regressor': RandomForestRegressor(random_state=42) # Random Forest Regressor
}

# Calculating the MSE value for all the models used
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    mse_train = mean_squared_error(y_train, y_pred_train)
    print(f"{name} (MSE_Test) = {mse_test}")
    print(f"{name} (MSE_Train) = {mse_train}")
Ridge Base (MSE_Test) = 132.31537841706378
Ridge Base (MSE_Train) = 140.67315191054044
Lasso Base (MSE_Test) = 131.73887782082974
Lasso Base (MSE_Train) = 141.2981387637917
ElasticNet Base (MSE_Test) = 134.1657013607262
ElasticNet Base (MSE_Train) = 143.27351553912357
Linear Regression (MSE_Test) = 132.2945433689032
Linear Regression (MSE_Train) = 140.65894726553083
Random Forest Regressor (MSE_Test) = 71.91473220266676
Random Forest Regressor (MSE_Train) = 10.302148113753274

On running various regression models which makes predictions for the usage percentage of the stations, we see that the Random Forest Regressor Model gives the best result. We therefore, use Random Forest Regressor Model for our case.¶

In [17]:
## Making new usage predictions for all stations

# Selecting features for prediction
X_pred = data[['Cluster', 'Weight_new', 'AVAILABLE_BIKES']]

# Renaming the 'Weight_new' column to 'Weight_initial' as per your requirement
X_pred = X_pred.rename(columns={'Weight_new': 'Weight_initial'})

# Making predictions using the Random Forest Regressor model
data['Usage_predictions_new'] = models['Random Forest Regressor'].predict(X_pred)

# Printing the head of the DataFrame with new usage predictions column
print('\nThe dataset with new usage prediction values is:\n', data.head())
The dataset with new usage prediction values is:
   STATION ID                NAME  BIKE_STANDS AVAILABLE_BIKE_STANDS  \
0          1       CLARENDON ROW         31.0             16.961644   
1          2  BLESSINGTON STREET         20.0             14.687671   
2          6  CHRISTCHURCH PLACE         20.0              13.50411   
3          8   CUSTOM HOUSE QUAY         30.0             15.736986   
4          9    EXCHEQUER STREET         24.0             13.287671   

   AVAILABLE_BIKES  LATITUDE  LONGITUDE Usage_predictions cluster_kmeans  \
0        14.024658   53.3409   -6.26250         50.898496              1   
1         5.304110   53.3568   -6.26814         18.881614              4   
2         6.432877   53.3434   -6.27012          29.91091              2   
3         9.128767   53.3479   -6.24805         50.995859              1   
4        10.652055   53.3430   -6.26358         58.496224              1   

  Cluster Weight_initial  Weight_new  Usage_predictions_new  
0       2      34.348425   29.584795              52.804455  
1       4      26.694436   23.588448              20.235560  
2       2       15.75505   13.570053              33.546574  
3       3      21.009641   18.786855              53.367074  
4       2      26.088431   22.470343              58.369274  
In [18]:
# Filtering out rows with the specified station IDs
data_use_new = data_pred[~data_pred['STATION ID'].isin(stations_to_remove)]

# Printing the updated DataFrame
print('The updated dataset with specified stations removed and new stations added is:\n', data_use_new)
The updated dataset with specified stations removed and new stations added is:
      STATION ID                               NAME  BIKE_STANDS  \
0             1                      CLARENDON ROW         31.0   
1             2                 BLESSINGTON STREET         20.0   
2             3                      BOLTON STREET         20.0   
3             4                       GREEK STREET         20.0   
4             5                   CHARLEMONT PLACE         40.0   
..          ...                                ...          ...   
108         112  NORTH CIRCULAR ROAD (O'CONNELL'S)         30.0   
109         113               MERRION SQUARE SOUTH         40.0   
110         114              WILTON TERRACE (PARK)         40.0   
111         115                   KILLARNEY STREET         30.0   
113         117                  HANOVER QUAY EAST         40.0   

     AVAILABLE_BIKE_STANDS  AVAILABLE_BIKES  LATITUDE  LONGITUDE  \
0                16.961644        14.024658   53.3409   -6.26250   
1                14.687671         5.304110   53.3568   -6.26814   
2                12.476712         7.506849   53.3512   -6.26986   
3                11.194521         8.775342   53.3469   -6.27298   
4                12.410959        27.553425   53.3307   -6.26018   
..                     ...              ...       ...        ...   
108              19.668493        10.317808   53.3578   -6.25156   
109              30.624658         7.463014   53.3386   -6.24861   
110              27.139726        12.490411   53.3337   -6.24834   
111               8.564384        21.402740   53.3548   -6.24758   
113              31.742466         8.241096   53.3437   -6.23175   

     Usage_predictions cluster_kmeans Cluster  
0            50.898496              1       2  
1            18.881614              4       4  
2            37.645840              0    None  
3            47.050910              3    None  
4            42.660139              3    None  
..                 ...            ...     ...  
108          28.730846              2       5  
109          27.959888              2    None  
110          40.782061              3       6  
111          46.850345              3    None  
113          30.305323              2    None  

[109 rows x 10 columns]
In [19]:
# Plotting heatmap corresponding to daily average usage level of bikes for each station (before)

fig = px.scatter_mapbox(data_use_new, 
                        lat='LATITUDE', 
                        lon='LONGITUDE',
                        hover_name='NAME',
                        hover_data=['Usage_predictions', 'STATION ID'],
                        color='Usage_predictions',
                        size='Usage_predictions',
                        size_max=20,
                        color_continuous_scale='Jet',
                        mapbox_style="carto-positron",
                        zoom=12)

# Updating layout
fig.update_layout(title='Daily Average Usage Percentage Heatmap for Dublin Bikes (before)')

# Showing the figure
fig.show()
In [20]:
# Merging the 'Usage_predictions_new' column from the 'data' DataFrame to the 'data_use_new' DataFrame based on the 'STATION ID' column
data_use_new = data_use_new.merge(data[['STATION ID', 'Usage_predictions_new']], on='STATION ID', how='left')

# Filling empty values in 'Usage_predictions_new' column with values from 'Usage_predictions' column
data_use_new['Usage_predictions_new'].fillna(data_use_new['Usage_predictions'], inplace=True)

# Station IDs to add
stations_to_add = ['new_1', 'new_2', 'new_3']

# Filtering the data DataFrame for rows where 'STATION ID' is in stations_to_add
rows_to_add = data[data['STATION ID'].isin(stations_to_add)]

# Adding these rows to data_use_new using concat
data_use_new = pd.concat([data_use_new, rows_to_add], ignore_index=True)

# Printing the updated 'data_use_new' DataFrame
print(data_use_new)
    STATION ID                NAME  BIKE_STANDS AVAILABLE_BIKE_STANDS  \
0            1       CLARENDON ROW        31.00             16.961644   
1            2  BLESSINGTON STREET        20.00             14.687671   
2            3       BOLTON STREET        20.00             12.476712   
3            4        GREEK STREET        20.00             11.194521   
4            5    CHARLEMONT PLACE        40.00             12.410959   
..         ...                 ...          ...                   ...   
107        115    KILLARNEY STREET        30.00              8.564384   
108        117   HANOVER QUAY EAST        40.00             31.742466   
109      new_1               new_1        36.60                     0   
110      new_2               new_2        23.75                     0   
111      new_3               new_3        29.30                     0   

     AVAILABLE_BIKES   LATITUDE  LONGITUDE Usage_predictions cluster_kmeans  \
0          14.024658  53.340900  -6.262500         50.898496              1   
1           5.304110  53.356800  -6.268140         18.881614              4   
2           7.506849  53.351200  -6.269860          37.64584              0   
3           8.775342  53.346900  -6.272980          47.05091              3   
4          27.553425  53.330700  -6.260180         42.660139              3   
..               ...        ...        ...               ...            ...   
107        21.402740  53.354800  -6.247580         46.850345              3   
108         8.241096  53.343700  -6.231750         30.305323              2   
109        36.600000  53.346460  -6.292984                 0            NaN   
110        23.750000  53.344208  -6.260145                 0            NaN   
111        29.300000  53.347426  -6.255404                 0            NaN   

    Cluster  Usage_predictions_new Weight_initial  Weight_new  
0         2              52.804455            NaN         NaN  
1         4              20.235560            NaN         NaN  
2      None              37.645840            NaN         NaN  
3      None              47.050910            NaN         NaN  
4      None              42.660139            NaN         NaN  
..      ...                    ...            ...         ...  
107    None              46.850345            NaN         NaN  
108    None              30.305323            NaN         NaN  
109       1              63.407113              0   48.298261  
110       2              57.571571              0   50.100252  
111       3              58.108833              0   60.298926  

[112 rows x 13 columns]
In [21]:
# Filter data for new stations
data_new_stations = data_use_new[data_use_new['STATION ID'].isin(['new_1', 'new_2', 'new_3'])]

# Plotting heatmap corresponding to daily average usage level of bikes for each station (after)
fig = px.scatter_mapbox(data_use_new, 
                        lat='LATITUDE', 
                        lon='LONGITUDE',
                        hover_name='NAME',
                        hover_data=['Usage_predictions_new', 'STATION ID'],
                        color='Usage_predictions_new',
                        size='Usage_predictions_new',
                        size_max=20,
                        color_continuous_scale='Jet',
                        mapbox_style="carto-positron",
                        zoom=12,
                        title='Daily Average Usage Percentage Heatmap for Dublin Bikes (after)')

# Adding black circle markers for new stations for ease of identification
new_stations_trace = go.Scattermapbox(
    lat=data_new_stations['LATITUDE'],
    lon=data_new_stations['LONGITUDE'],
    mode='markers',
    marker=dict(symbol='circle', size=15, color='white'),
    hoverinfo='text',
    text=data_new_stations['NAME']
)

fig.add_trace(new_stations_trace)

# Showing the figure
fig.show()

The white circles represent the new stations that have been added to the respective clusters. The locations of these stations have been done by assessing possible installation site using Google Streetview to optimise user experience.¶

In [22]:
# Saving the final dataset with new usage predictions
data_use_new.to_csv('dataset_with_new_predictions.csv', index=False)

# Selecting specific columns to display
grouped = data_use_new.groupby('Cluster').apply(lambda x: x[['Cluster', 'STATION ID', 'Usage_predictions', 'Usage_predictions_new']]).sort_values(by='Cluster').reset_index(drop=True)

# Printing the grouped data
print('Grouped data by Clusters with specific columns:\n', grouped)
Grouped data by Clusters with specific columns:
    Cluster STATION ID Usage_predictions  Usage_predictions_new
0        1         93         58.993286              55.402688
1        1      new_1                 0              63.407113
2        1        100         61.496249              46.221735
3        1         86         53.276721              54.832878
4        1         92         68.527906              62.979385
5        1         94         40.448883              46.153871
6        2      new_2                 0              57.571571
7        2         17         27.798568              33.103687
8        2         27           58.0668              43.154404
9        2          1         50.898496              52.804455
10       2         10         66.885357              57.484033
11       2          9         58.496224              58.369274
12       2          6          29.91091              33.546574
13       2         14         53.449347              59.072910
14       2         29         54.602995              53.556604
15       3         99         25.388022              32.183186
16       3         64         42.025642              50.658130
17       3         33         66.261155              61.805066
18       3         32         53.227569              57.805982
19       3         38         57.779409              58.553458
20       3         16         55.528331              53.474072
21       3          8         50.995859              53.367074
22       3         21         57.525088              60.319985
23       3      new_3                 0              58.108833
24       3         23         62.728942              51.847799
25       3         22         61.522133              58.787304
26       4         30         18.577317              18.489099
27       4        103         15.908157              17.722522
28       4        102         23.102577              26.570461
29       4         12         20.505096              21.478624
30       4        105         19.402861              25.111618
31       4        110         27.726214              28.719501
32       4          2         18.881614              20.235560
33       5         78         26.857023              30.638531
34       5         28         31.692939              31.078592
35       5         15         18.046534              18.424242
36       5         60         31.614857              31.076083
37       5         59         20.258112              19.474751
38       5         44         19.381672              26.580429
39       5         79          17.44585              18.341248
40       5        112         28.730846              28.351047
41       6        114         40.782061              31.839393
42       6         39          29.47649              29.180011
43       6         20         32.093773              27.292227
44       6         47         26.421215              27.522670
45       6         19          44.19326              28.797845
46       6         13         23.652421              23.513533
C:\Users\Dell\AppData\Local\Temp\ipykernel_43472\1055471384.py:5: FutureWarning:

Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)

To adopt the future behavior and silence this warning, use 

	>>> .groupby(..., group_keys=True)

We can notice that the redistribution of load on each station and the high usage levels for top stations have reduced. Furthermore, due to removal of five stations, we see that the usage level of nearby stations in the cluster have increased.¶